library(devtools)
library(rgdal)
library(GGally)
library(ggplot2)
library(plotly)
library(scales)
library(ggthemes)
library(RColorBrewer)
library(viridis)
library(grid)
library(gridExtra)
library(ggimage)
library(png)
library(gridGraphics)
library(dplyr)
library(tidyr)
#devtools::install_github('bart6114/artyfarty')
library('artyfarty')
library(tm)
library(wordcloud)
# Clutch-time team stats for the season (source of every Part-1 plot)
clutch = read.csv('C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fetched.csv')
#number of games played vs number of wins
# Reshape to long format: one row per (team, type) with type in {GP, W}.
# pivot_longer replaces the superseded gather(); coerced back to a plain
# data.frame so downstream indexing behaves exactly as before.
df1 = clutch[,c('GP','W','team')]
df1 = as.data.frame(pivot_longer(df1, cols = -team,
                                 names_to = "type", values_to = "count"))
# Order team factor levels by games played so the flipped bars come out sorted
temp = df1[df1$type=='GP',]
new_levels = as.character(temp[order(temp$count),]$team)
df1$team = factor(df1$team, levels = new_levels)
# Overlaid (position="identity") horizontal bars: W is drawn on top of GP.
# (Fixed: the original set xlab/ylab twice; the first pair was silently
# overridden by the second, so only the effective labels are kept.)
df1 %>% ggplot(aes(x=team, y=count, fill=type))+
  geom_bar(stat="identity",position="identity")+
  scale_fill_manual(name="type of games",values = pal("five38"))+
  coord_flip()+ggtitle("number of games played (GP) v.s number of wins (W)")+
  geom_hline(yintercept=0)+
  ylab("number of games")+
  xlab("team name")+
  # abs() on the labels keeps the axis readable for diverging-style plots
  scale_y_continuous(breaks = pretty(df1$count),labels = abs(pretty(df1$count)))+
  theme_scientific()
#Personal fouls (PF) and turnovers (TOV)
# Diverging bars: PF counts are negated so PF extends one side of zero and
# TOV the other. pivot_longer replaces the superseded gather().
df1 = clutch[,c('PF','TOV','team')]
df1 = as.data.frame(pivot_longer(df1, cols = -team,
                                 names_to = "type", values_to = "count"))
df1$count <- ifelse(df1$type =="PF",df1$count*(-1),df1$count)
# Order teams by turnover count.
# (Fixed: the original line was `temp = temp = df1[...]` -- a harmless but
# confusing doubled assignment.)
temp = df1[df1$type=='TOV',]
new_levels = as.character(temp[order(temp$count),]$team)
df1$team = factor(df1$team, levels = new_levels)
# (Fixed: removed the first xlab/ylab pair that was overridden further down.)
df1 %>% ggplot(aes(x=team, y=count, fill=type))+
  geom_bar(stat="identity",position="identity")+
  scale_fill_manual(values = pal("five38"))+
  coord_flip()+ggtitle("Personal fouls (PF) and turnovers (TOV)")+
  geom_hline(yintercept=0)+
  ylab("counts")+
  xlab("team name")+
  # abs() hides the sign trick used to make the bars diverge
  scale_y_continuous(breaks = pretty(df1$count),labels = abs(pretty(df1$count)))+
  theme_scientific()
# divergent plot: share of team points from 2PT vs 3PT vs FT
# pivot_longer replaces the superseded gather().
df1 = clutch[,c('PCT_PTS_2PT','PCT_PTS_3PT','PCT_PTS_FT','team')]
df1 = as.data.frame(pivot_longer(df1, cols = -team,
                                 names_to = "type", values_to = "count"))
# Order teams by their 2PT share (taken BEFORE the sign flip below)
temp = df1[df1$type=='PCT_PTS_2PT',]
new_levels = as.character(temp[order(temp$count),]$team)
df1$team = factor(df1$team, levels = new_levels)
# Negate the 2PT share so it diverges to the opposite side of zero
df1$count <- ifelse(df1$type =="PCT_PTS_2PT",df1$count*(-1),df1$count)
# (Fixed: removed the first xlab/ylab pair that was overridden further down.)
df1 %>% ggplot(aes(x=team, y=count, fill=type))+
  geom_col()+
  scale_fill_manual(values = pal("five38"))+
  coord_flip()+ggtitle("2PT%,3PT%,FT%")+
  geom_hline(yintercept=0)+
  ylab("percentage")+
  xlab("team name")+
  # abs() hides the sign trick used to make the bars diverge
  scale_y_continuous(breaks = pretty(df1$count),labels = abs(pretty(df1$count)))+
  theme_scientific()
# Base URL for per-team logo PNGs (?raw=true makes GitHub serve the image)
path = 'https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/'
df1 = clutch[,c('OFF_RATING','DEF_RATING','team')]
# paste0 replaces paste(..., sep='')
df1$img = paste0(path, df1$team, '.png?raw=true')
# Offensive vs defensive rating, each team drawn as its logo.
# Defensive rating axis is reversed so "better" defense points upward.
# The logo is now a mapped aesthetic (aes(image = img)) instead of a raw
# vector argument, so it stays tied to each row of the plot data.
ggplot(df1,aes(x=OFF_RATING,y=DEF_RATING))+
  geom_point()+
  scale_y_reverse()+
  geom_image(aes(image = img), size = .05)+
  theme_scientific()+
  ggtitle("offensive rating v.s. defensive rating")+
  xlab('offensive rating')+ylab('defensive rating')
### Part 2
## Preprocess data to merge with the team
# Lookup table mapping each player id to a team name; only the id and team
# columns are kept for the merges below
df_name_team = read.csv(file="C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Name_Team.csv")
df_name_team = df_name_team[,c("PERSON_ID","Team_Name")]
# Rename PERSON_ID -> player_id so it matches the join key in the stat CSVs
colnames(df_name_team)[1] = "player_id"
# Read one per-player stat CSV and attach each player's team name.
# path: CSV file containing a player_id column.
# team: player_id -> Team_Name lookup (defaults to the global df_name_team).
# Returns the merged data frame with any leftover "X" row-index column dropped.
my_read = function(path, team = df_name_team) {
  merged = merge(read.csv(file = path), team, by = "player_id", all = TRUE)
  keep = colnames(merged) != "X"
  merged[, keep]
}
# Per-player clutch stat tables, each merged with team names via my_read().
# File-name prefixes (presumed from names): 3pct = 3PT percentage,
# 3fgm = 3PT makes, pct = FG percentage, fgm = FG makes, pts = points,
# fta = FT attempts, fct = FG attempts/percentage(?), ftm = FT makes
# -- TODO confirm against the data dictionary.
df_3pct = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/3pct_df.csv")
df_3fgm = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/3fgm_df.csv")
# Makes (.x suffix) and percentages (.y suffix) side by side per player
df_3 = merge(df_3fgm,df_3pct,by = "player_id",all=TRUE)
df_pct = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/pct_df.csv")
df_fgm = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fgm_df.csv")
# df_all: FG makes (.x) joined with FG percentages (.y)
df_all = merge(df_fgm,df_pct,by = "player_id",all=TRUE)
df_pts = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/pts_df.csv")
df_fta = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fta_df.csv")
df_fct = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fct_df.csv")
df_ftm = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/ftm_df.csv")
# Define FGA: Field Goal Attempt
# NOTE(review): attempts are derived as makes / df_fct$overall, which is only
# correct if df_fct holds field-goal PERCENTAGE -- TODO confirm. This also
# relies on df_fgm, df_fct, df_pts and df_fta having identical row order
# (all are my_read() merges on player_id) -- verify alignment.
FGA = df_fgm$overall / df_fct$overall
# Define TSP: True shooting percent
# Standard formula: PTS / (2 * (FGA + 0.44 * FTA))
TSP = df_pts$overall/(2*(FGA+0.44*df_fta$overall))
df_pts['TSP'] = TSP
# Make a copy of df_pts
df_pts_v1 = df_pts
# Subset to remove all the NAs due to players that did not have a team or did not play in 2016
df_pts_v1_2 = df_pts_v1[!is.na(df_pts_v1$TSP),]
# Interactive scatter of each player's overall TSP vs overall points,
# one facet per team. Aesthetics are declared once in ggplot().
p_TSP = ggplot(df_pts_v1_2, aes(overall, TSP, color = player_name)) +
  geom_point(size = 1) +
  facet_wrap(~Team_Name) +
  labs(title = "TSP V.S PTS ", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP)
# Same data on a single panel; team is encoded as point shape instead.
p_TSP_All = ggplot(df_pts_v1_2,
                   aes(overall, TSP, color = player_name, shape = Team_Name)) +
  geom_point(size = 2) +
  labs(title = "TSP V.S PTS ", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP_All)
# Attach each player's overall makes so FG% can be plotted against volume
df_pct['df_fgm_overall']=df_fgm$overall
df_pct_v1 = df_pct
# NOTE(review): rows are filtered with df_fgm's NA mask rather than
# df_pct's own -- this assumes both tables share identical row order
# (both come from my_read() merges on player_id); verify.
df_pct_v1_2 = df_pct_v1[!is.na(df_fgm$player_name),]
# Per-player overall FG% vs overall makes, faceted by team
p_FGMPCT = ggplot(df_pct_v1_2)+
geom_point(aes(df_fgm_overall,overall,color = player_name),size = 1)+
facet_wrap(~Team_Name)+
labs(title = "pct_overall V.S fgm_overall ",x = 'fgm_overall', y='pct_overall')
ggplotly(p_FGMPCT)
# Same data on a single panel, team encoded as point shape
p_FGMPCT_All = ggplot(df_pct_v1_2)+
geom_point(aes(df_fgm_overall,overall,color = player_name,shape = Team_Name),size = 2)+
labs(title = "pct_overall V.S fgm_overall ",x = 'fgm_overall', y='pct_overall')
ggplotly(p_FGMPCT_All)
# Same pattern as the FG plots above, but for 3-pointers:
# attach overall 3PT makes so 3PT% can be plotted against volume
df_3pct['df_3fgm_overall']=df_3fgm$overall
df_pct3_v1 = df_3pct
# NOTE(review): filters with df_3fgm's NA mask -- assumes df_3pct and
# df_3fgm share identical row order; verify.
df_pct3_v1_2 = df_pct3_v1[!is.na(df_3fgm$player_name),]
# Per-player overall 3PT% vs overall 3PT makes, faceted by team
p_3FGM3PCT = ggplot(df_pct3_v1_2)+
geom_point(aes(df_3fgm_overall,overall,color = player_name),size = 1)+
facet_wrap(~Team_Name)+
labs(title = "3pct_overall V.S 3fgm_overall ",x = '3fgm_overall', y='3pct_overall')
ggplotly(p_3FGM3PCT)
# Single panel, team encoded as point shape
p_3FGM3PCT_All = ggplot(df_pct3_v1_2)+
geom_point(aes(df_3fgm_overall,overall,color = player_name,shape = Team_Name),size = 2)+
labs(title = "3pct_overall V.S 3fgm_overall ",x = '3fgm_overall', y='3pct_overall')
ggplotly(p_3FGM3PCT_All)
# Attach FT makes for the 30sec/plus-minus-5 situation to the attempts table.
# ("plusmiuns" is a typo baked into the new column name; it is referenced
# consistently below, so it is intentionally left unchanged.)
# NOTE(review): copying a column across frames assumes df_ftm and df_fta
# share identical row order; verify.
df_fta['df_ftm_30sec_plusmiuns_5'] = df_ftm$X30sec_plusminus_5
df_fta_v1 = df_fta
df_fta_v1_2 = df_fta_v1[!is.na(df_fta$player_name),]
# FT makes vs attempts in the final 30s within 5 points, faceted by team
p_fta_ftm = ggplot(df_fta_v1_2)+
geom_point(aes(X30sec_plusminus_5,df_ftm_30sec_plusmiuns_5,color = player_name),size = 1)+
facet_wrap(~Team_Name)+
labs(title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 ",x = 'X30sec_plusminus_5', y='df_ftm_30sec_plusmiuns_5')
ggplotly(p_fta_ftm)
# Same data on a single panel, team encoded as point shape
p_fta_ftm = ggplot(df_fta_v1_2)+
geom_point(aes(X30sec_plusminus_5,
df_ftm_30sec_plusmiuns_5,
color = player_name,
shape=Team_Name),
size = 1.3,
alpha=0.5)+
labs(title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 ",x = 'X30sec_plusminus_5', y='df_ftm_30sec_plusmiuns_5')
ggplotly(p_fta_ftm)
# Same plot again, with jitter to separate overplotted integer counts
p_fta_ftm = ggplot(df_fta_v1_2)+
geom_point(aes(X30sec_plusminus_5,
df_ftm_30sec_plusmiuns_5,
color = player_name,
shape=Team_Name),
size = 1.3,
alpha=0.5,
position = "jitter")+
labs(title = "df_ftm_30sec_plusmiuns_5 V.S X30sec_plusminus_5 ",x = 'X30sec_plusminus_5', y='df_ftm_30sec_plusmiuns_5')
ggplotly(p_fta_ftm)
# average within group 3point
# (cbP palette is defined but scale_colour_colorblind() is used below;
# kept as-is)
cbP = c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Team totals of 3PT makes across the clutch-situation columns (3:12)
df_3fgm_sum = aggregate(df_3fgm[,3:12], list(df_3fgm$Team_Name), sum, na.rm = TRUE)
# Elementwise makes / percentage, intended to recover attempts per player.
# NOTE(review): dividing whole data frames requires df_3fgm and
# df_3pct[,1:13] to have identical column layout and row order; non-numeric
# columns produce NAs with warnings and are overwritten just below -- verify
# the alignment holds.
deno = df_3fgm/df_3pct[,1:13]
deno$player_name = df_3fgm$player_name
deno$player_id = df_3fgm$player_id
deno$Team_Name = df_3fgm$Team_Name
# Team totals of (approximate) attempts
deno_modi = aggregate(deno[,3:12], list(deno$Team_Name), sum, na.rm = TRUE)
# Team-level average 3PT% = total makes / total attempts; the Group.1
# column is clobbered by the division and restored from deno_modi
average3point = df_3fgm_sum/deno_modi
average3point$Group.1=deno_modi$Group.1
# 0/0 cases (no attempts) become NaN -> zeroed out
average3point[is.na(average3point)] = 0
# Presumably 4 top and 4 bottom teams for comparison -- TODO confirm
TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs",
"Lakers","Suns","76ers","Nets")
TopLow3point = average3point[average3point$Group.1 %in% TopLowTeam,]
# Parallel coordinates over the "down" situations (columns 2:7)
p1 = ggparcoord(TopLow3point,
columns = 2:7,
groupColumn ='Group.1',
scale = 'globalminmax')+
geom_vline(xintercept = 0:6, color = "lightblue")+
theme(axis.text.x=element_text(angle=90))+
labs(title = "Average 3PT Last Xmin yDown Top4 V.S Low4",x = 'Indicator', y='Team Average')+
scale_colour_colorblind()
# Parallel coordinates over the "plusminus" situations (columns 8:10 + overall)
p2 = ggparcoord(TopLow3point,
columns = c(2,8:10),
groupColumn ='Group.1',
scale = 'globalminmax')+
geom_vline(xintercept = 0:5, color = "lightblue")+
theme(axis.text.x=element_text(angle=90))+
labs(title = "Average 3PT Last Xmin yDownorHiger Top4 V.S Low4",x = 'Indicator', y='Team Average')+
scale_colour_colorblind()
# average within group all point
# Same pipeline as the 3PT block above, applied to all field goals.
cbP = c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Team totals of FG makes across the clutch-situation columns (3:12)
df_fgm_sum = aggregate(df_fgm[,3:12], list(df_fgm$Team_Name), sum, na.rm = TRUE)
# Elementwise makes / percentage ~ attempts per player.
# NOTE(review): requires identical column layout and row order between
# df_fgm and df_pct[,1:13]; verify.
deno = df_fgm/df_pct[,1:13]
deno$player_name = df_fgm$player_name
deno$player_id = df_fgm$player_id
deno$Team_Name = df_fgm$Team_Name
deno_modi = aggregate(deno[,3:12], list(deno$Team_Name), sum, na.rm = TRUE)
# Team-level average FG% = total makes / total attempts; Group.1 restored
# after being clobbered by the division
averagepoint = df_fgm_sum/deno_modi
averagepoint$Group.1=deno_modi$Group.1
# 0/0 cases (no attempts) become NaN -> zeroed out
averagepoint[is.na(averagepoint)] = 0
TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs",
"Lakers","Suns","76ers","Nets")
TopLowpoint = averagepoint[averagepoint$Group.1 %in% TopLowTeam,]
#averagepoint
# Parallel coordinates over the "down" situations
p3 = ggparcoord(TopLowpoint,
columns = 2:6,
groupColumn ='Group.1',
scale = 'globalminmax')+
geom_vline(xintercept = 0:5, color = "lightblue")+
theme(axis.text.x=element_text(angle=90))+
labs(title = "Average TotalPT Last Xmin yDown Top4 V.S Low4",x = 'Indicator', y='Team Average')+
scale_colour_colorblind()
# Parallel coordinates over the "plusminus" situations
p4 = ggparcoord(TopLowpoint,
columns = c(2,7:10),
groupColumn ='Group.1',
scale = 'globalminmax')+
geom_vline(xintercept = 0:5, color = "lightblue")+
theme(axis.text.x=element_text(angle=90))+
labs(title = "Average TotalPT Last Xmin yDownorHiger Top4 V.S Low4",x = 'Indicator', y='Team Average')+
scale_colour_colorblind()
# 2x2 grid: 3PT plots (p1, p2) over total-point plots (p3, p4)
grid.arrange(p1, p2, p3, p4, nrow = 2)
path = 'https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/'
# Attach logo URL per team (Group.1 holds the team name after aggregate())
averagepoint$img = paste0(path, averagepoint$Group.1, '.png?raw=true')
# Drop the first aggregate row -- presumably the empty/no-team group;
# TODO confirm against the data
rmv_0_averagepoint = averagepoint[2:31,]
average3point$img = paste0(path, average3point$Group.1, '.png?raw=true')
rmv_0_average3point = average3point[2:31,]
# FIX: both logo layers previously used df1$img -- a leftover vector from
# Part 1 with different row order -- which pinned wrong logos to the points.
# Each plot now maps its own data's img column as an aesthetic.
# FIX: the two titles were swapped relative to the data they plot
# (p1 uses total-point averages, p2 uses 3PT averages).
p1 = ggplot(rmv_0_averagepoint,aes(overall,X10sec_down_3))+
  geom_point()+
  geom_image(aes(image = img), size = .05)+
  theme_scientific()+
  labs(title = "Total Average X10sec_down_3 v.s. Overall",x = 'Overall', y='X10sec_down_3')
p2 = ggplot(rmv_0_average3point,aes(overall,X10sec_down_3))+
  geom_point()+
  geom_image(aes(image = img), size = .05)+
  theme_scientific()+
  labs(title = "3pt Average 10sec_down_3 v.s. Overall",x = 'Overall', y='X10sec_down_3')
grid.arrange(p1, p2, nrow = 1)
# Jittered scatter of each player's overall value against the
# 1min/down-5 situation, one facet per team (jitter separates
# overplotted points).
ggplot(df_pct, aes(x = X1min_down_5, y = overall)) +
  geom_point(position = position_jitter(w = 0.01, h = 0.02),
             alpha = 0.5,
             size = 3) +
  facet_wrap(~Team_Name) +
  labs(title = "overall V.S X1min_down_5",
       x = 'X1min_down_5',
       y='overall')
# Percentage (.y, from df_pct) against actual makes (.x, from df_fgm) for
# the 5min/plus-minus-5 situation; jittered, faceted by team, interactive.
pp = ggplot(df_all, aes(x = X5min_plusminus_5.x,
                        y = X5min_plusminus_5.y,
                        color = player_name.x)) +
  geom_point(position = position_jitter(w = 0.01, h = 0.02),
             alpha = 0.5,
             size = 2) +
  facet_wrap(~Team_Name.x) +
  labs(title = "5min_plusminus_5_percent V.S X5min_plusminus_5_actual",
       x = 'X5min_plusminus_5_actual',
       y='5min_plusminus_5_percent')
ggplotly(pp)
# Scatterplot matrices pairing actual makes (.x, from df_fgm) with
# percentages (.y, from df_pct) for matching clutch situations.
down3_vars = c("X10sec_down_3.x", "X10sec_down_3.y",
               "X30sec_down_3.x", "X30sec_down_3.y")
pairs(df_all[down3_vars])
#df_all
down5_vars = c("X1min_down_5.x", "X1min_down_5.y",
               "X3min._down_5.x", "X3min._down_5.y",
               "X5min._down_5.x", "X5min._down_5.y")
pairs(df_all[down5_vars])
#df_all
plusminus_vars = c("X30sec_plusminus_5.x", "X30sec_plusminus_5.y",
                   "X1min_plusminus_5.x", "X1min_plusminus_5.y",
                   "X3min_plusminus_5.x", "X3min_plusminus_5.y")
pairs(df_all[plusminus_vars])
# Team as a factor so it can be grouped and reordered below
df_all$Team_Name.x = as.factor(df_all$Team_Name.x)
# Per-team mean of overall makes (overall.x comes from df_fgm in the merge)
countorder = df_all %>% group_by(Team_Name.x) %>% summarize(av=mean(overall.x, na.rm=TRUE))
#df_all = merge(df_fgm,df_pct,by = "player_id",all=TRUE)
# Horizontal bar chart of teams ordered by their average overall fgm
ggplot(countorder, aes(reorder(Team_Name.x,av),av)) +
geom_col(color = "tomato", fill = "orange", alpha = .2)+
coord_flip()+
theme_scientific()+
labs(title = "Team Average Overall fgm",x = 'Team', y='Average Overall fgm')
### Part4
#TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs","Lakers","Suns","76ers","Nets")
# Raw tweet dumps read as single strings: one combined file plus per-team
# files (readr:: is called with an explicit namespace, so library(readr)
# is not needed above)
tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/tweet_content.txt")
Spurs_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/Spurs.txt")
Warriors_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/Warriors.txt")
Lakers_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/Lakers.txt")
# "T76ers" because R identifiers cannot start with a digit
T76ers_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/76ers.txt")
# Build a cleaned term-document matrix from raw tweet text and draw a word
# cloud of the terms occurring at least `min_freq` times. Also prints the
# top term frequencies as a side effect.
# tweet_content: a single character string of concatenated tweets.
# min_freq: minimum term frequency for a word to appear in the cloud.
My_word_cloud = function(tweet_content,min_freq){
  # FIX: tolower is a base function, not a tm transformation, so tm (>= 0.6)
  # requires it to be wrapped in content_transformer(); with that in place
  # the old tm_map(PlainTextDocument) repair step is unnecessary and has
  # been dropped.
  docs = Corpus(VectorSource(tweet_content)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(stripWhitespace)
  tdm = TermDocumentMatrix(docs) %>%
    as.matrix()
  # The corpus holds a single document, so column 1 carries every term count
  content = as.matrix(tdm[,1])
  content = as.matrix(content[order(content, decreasing=TRUE),])
  print("head(Whole twitter)")
  print(head(content))
  print("Whole twitter's most occuring words:")
  print(head(rownames(content)))
  # Keep only the darker end of the YlGnBu palette (first 3 shades are
  # too light to read)
  pal <- brewer.pal(9, "YlGnBu")
  pal <- pal[-(1:3)]
  wordcloud(rownames(content), content, min.freq =min_freq, scale=c(5, .2), random.order = FALSE, random.color = FALSE, colors= pal)
}
## Let's look at what is going on if we plot the twitter!
# Word cloud over the full combined tweet corpus (terms seen >= 100 times)
My_word_cloud(tweet_content = tweet_content,min_freq=100)
# Recorded console output from a previous run:
## [1] "head(Whole twitter)"
## [,1]
## ontnt 1539
## warriors 1367
## pts 1337
## nba 1238
## spurs 1160
## player 1072
## [1] "Whole twitter's most occuring words:"
## [1] "ontnt" "warriors" "pts" "nba" "spurs" "player"
# NOTE(review): "ontnt" looks like a cleaning artifact (presumably a remnant
# of "content" after punctuation/number removal) -- verify the tokenization.
## Let's look at what is going on WITH TEAM
# Team files alternate a metadata line and a tweet line; split(x, 1:2)
# separates odd lines ([[1]], metadata) from even lines ([[2]], tweets).
splited_Spurs = strsplit(Spurs_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# 4th whitespace-separated token of each metadata line -- presumably the
# date field; TODO confirm against the raw file. vapply replaces the
# original grow-with-c() loop (identical result, no quadratic copying).
tweet_time = vapply(splited_Spurs_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Spurs_df = data.frame("Date"=tweet_time,"content"=splited_Spurs_2[[2]])
# Collapse all Spurs tweets into one string for the word cloud
sp = paste(splited_Spurs_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 21)
# Recorded console output from a previous run:
## [1] "head(Whole twitter)"
## [,1]
## spurs 2476
## man 414
## utd 323
## beat 286
## warriors 229
## game 227
## [1] "Whole twitter's most occuring words:"
## [1] "spurs" "man" "utd" "beat" "warriors" "game"
# Warriors tweets. NOTE(review): the *_Spurs variable names are reused from
# the Spurs section; they are kept unchanged so any later reuse still works,
# but they are misleading.
splited_Spurs = strsplit(Warriors_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# vapply replaces the original grow-with-c() loop (identical result)
tweet_time = vapply(splited_Spurs_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Spurs_df = data.frame("Date"=tweet_time,"content"=splited_Spurs_2[[2]])
sp = paste(splited_Spurs_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 21)
# Recorded console output from a previous run:
## [1] "head(Whole twitter)"
## [,1]
## warriors 2248
## curry 256
## game 255
## spurs 232
## will 215
## stephen 187
## [1] "Whole twitter's most occuring words:"
## [1] "warriors" "curry" "game" "spurs" "will" "stephen"
# Lakers tweets. NOTE(review): *_Spurs names reused again -- kept unchanged
# for compatibility, but misleading.
splited_Spurs = strsplit(Lakers_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# vapply replaces the original grow-with-c() loop (identical result)
tweet_time = vapply(splited_Spurs_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Spurs_df = data.frame("Date"=tweet_time,"content"=splited_Spurs_2[[2]])
sp = paste(splited_Spurs_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 21)
# Recorded console output from a previous run:
## [1] "head(Whole twitter)"
## [,1]
## lakers 908
## kawhi 224
## leonard 131
## lebron 95
## trade 92
## los 91
## [1] "Whole twitter's most occuring words:"
## [1] "lakers" "kawhi" "leonard" "lebron" "trade" "los"
# 76ers tweets. NOTE(review): *_Spurs names reused again -- kept unchanged
# for compatibility, but misleading.
splited_Spurs = strsplit(T76ers_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)
# vapply replaces the original grow-with-c() loop (identical result)
tweet_time = vapply(splited_Spurs_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Spurs_df = data.frame("Date"=tweet_time,"content"=splited_Spurs_2[[2]])
sp = paste(splited_Spurs_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 10)
# Recorded console output from a previous run ("ers" is presumably the
#残 of "76ers" after number removal -- verify tokenization):
## [1] "head(Whole twitter)"
## [,1]
## ers 532
## heat 117
## game 98
## nba 87
## embiid 85
## philadelphia 82
## [1] "Whole twitter's most occuring words:"
## [1] "ers" "heat" "game" "nba"
## [5] "embiid" "philadelphia"